##merge two face datasets and analyze
rm(list = ls())
library(reshape2)
library(stringr)
library(car)
library(apsrtable)
library(stringr)
library(Zelig)


cl   <- function(dat,fm, cluster){
  require(sandwich, quietly = TRUE)
  require(lmtest, quietly = TRUE)
  M <- length(unique(cluster))
  N <- length(cluster)
  K <- fm$rank
  dfc <- (M/(M-1))*((N-1)/(N-K))
  uj  <- apply(estfun(fm),2, function(x) tapply(x, cluster, sum));
  uj = uj[complete.cases(uj),]  ##modify this function so that it drops NA's become cities don't show up
  
  vcovCL <- dfc*sandwich(fm, meat=crossprod(uj)/N)
  return(vcovCL) }
                                   
all_content = readLines('../data/researcher_data_HDLMS_Face_Stimuli_Ahuja_May_2016_4.csv')
skip_second = all_content[-2]
dat = read.csv(textConnection(skip_second), header = TRUE, stringsAsFactors = FALSE)
dat$time = difftime(strptime(dat$V9,"%Y-%m-%d %H:%M:%S"),strptime(dat$V8,"%Y-%m-%d %H:%M:%S"))
dat = subset(dat, is.na(dat$time)==F)
dat1 = subset(dat, dat$time > quantile(dat$time,.05) & dat$time < quantile(dat$time,.95))
#dat1 = dat
print(nrow(dat1))

##
all_content = readLines('../data/Face_Stimuli_2.csv')
skip_second = all_content[-2]
dat = read.csv(textConnection(skip_second), header = TRUE, stringsAsFactors = FALSE)
dat$time = difftime(strptime(dat$V9,"%Y-%m-%d %H:%M:%S"),strptime(dat$V8,"%Y-%m-%d %H:%M:%S"))
dat = subset(dat, is.na(dat$time)==F)
##hack to get rid of bad elemetn
dat = dat[dat$Q1_101_TEXT!=5,]
dat2 = subset(dat, dat$time > quantile(dat$time,.05) & dat$time < quantile(dat$time,.95))
#dat2 = dat

print(nrow(dat2))

##checking for overlapping names
dat1.keep = names(dat1)[names(dat1)%in%names(dat2)]
dat2.keep = names(dat2)[names(dat2)%in%names(dat1)]

dat1 = dat1[,dat1.keep]
dat2 = dat2[,dat2.keep]

dat1$study = 1
dat2$study = 2

dat = rbind(dat1,dat2)
#dat = dat2

print(nrow(dat))

#dat = dat[dat$Race == 1,]
#dat = subset(dat, race == 1)
dat = subset(dat, Race != 2)
#dat = dat[dat$party == 3,]
#dat = dat[dat$hisp == 1,]

##ordering
#dat$asian.first = ifelse(dat$DO.BR.FL_5 == 'asians|blacks',1,0)
#dat = dat[dat$asian.first == 1,]

##msa.dat
metro.dat = read.csv('../data/2010 Dissimilarity Metro Area.csv')
micro.dat = read.csv('../data/Micro Seg 2010.csv')
micro.pop = read.csv('../data/nhgis0048_ds176_20105_2010_cbsa.csv')  ##micro populaition data

metro.dat$metroid = as.numeric(gsub(',','',metro.dat$metroid))
metro.dat$m_t_a_10 = as.numeric(gsub(',','',metro.dat$m_t_a_10))
metro.dat$m_w_a_10 = as.numeric(gsub(',','',metro.dat$m_w_a_10))
metro.dat$m_b_a_10 = as.numeric(gsub(',','',metro.dat$m_b_a_10))
metro.dat$percent.black = metro.dat$m_b_a_10/metro.dat$m_w_a_10
metro.dat$percent.black = ifelse(is.na(metro.dat$percent.black) == T,0, metro.dat$percent.black)

###merge micro data
micro.dat = merge(micro.dat,
                  micro.pop,
                  by.x = 'ma',
                  by.y = 'CBSAA',
                  all.x = T,
                  all.y = F)

micro.dat$percent.black = micro.dat$JMJE004/micro.dat$JMJE003

##get states
state.temp.metro = str_split_fixed(metro.dat$metroname, ", ", 2)[,2]
state.temp.metro = substr(state.temp.metro,1,2)
metro.dat$state = state.temp.metro

micro.dat$maname = as.character(micro.dat$maname)
micro.dat$state = substr(micro.dat$maname,nchar(micro.dat$maname)-1,nchar(micro.dat$maname))

metro.dat = metro.dat[,c('metroid','state','m_dwb_a_10','percent.black')]
micro.dat = micro.dat[,c('ma','state','dbnb10','percent.black')]

colnames(metro.dat) = c('metroid','state','dissimilarity','percent.black')
metro.dat$dissimilarity = metro.dat$dissimilarity/100 ##rescale because this is in units of 0-100
colnames(micro.dat) = c('metroid','state','dissimilarity','percent.black')

cbsa.dat = rbind(metro.dat,micro.dat)

south = c('DE','MD','WV','VA','KY','NC','TN','AR','OK','TX','LA','MS','AL','GA','FL','SC','NC')

cbsa.dat$south = ifelse(cbsa.dat$state%in%south,1,0)

##use bridge data to merge zips to csv
zips.mpsa = read.csv('../data/ZIP_CBSA_092015.csv')

cbsa.dat = merge(cbsa.dat,
                 zips.mpsa,
                 by.x = 'metroid',
                 by.y = 'CBSA',
                 all.x = T,
                 all.y = F)


dat = dat[is.na(dat$Zip.Code)==F,]

dat = merge(dat,
            cbsa.dat,
            by.x = 'Zip.Code',
            by.y = 'ZIP',
            all.x = T,
            all.y = F)

dat$college.educated = ifelse(dat$Education>5,1,0)
dat$income.recode = Recode(dat$Salary,
                           "1 = 1500;
                            2 = 4000;
                            3 = 6250;
                            4 = 8750;
                            5 = 10500;
                            6 = 11750;
                            7 = 13750;
                            8 = 16000;
                            9 = 18500;
                            10 = 21000;
                            11 = 23500;
                            12 = 27500;
                            13 = 32500;
                            14 = 37500;
                            15 = 42500;
                            16 = 47500;
                            17 = 55000;
                            18 = 67500;
                            19 = 82500;
                            20 = 95000;
                            21 = 105000;
                            22 = 115000;
                            23 = 127500;
                            24 = 142500;
                            25 = 150000;
                            26 = NA")
dat$income.recode[is.na(dat$income.recode)==T] = mean(dat$income.recode,na.rm = T)
dat$conservative = ifelse(dat$attitude_self %in% c(5,6,7),1,0)

###when I merge with dat, maybe need to disambiguate later, 
#3because some zips in more than one mpsa
###but it's fine for now.

##recode questions 
race.questions = grep('Q1',names(dat))

##cycle through each and use function
for(i in race.questions){
  for(j in 1:nrow(dat)){
    dat[j,i] = ifelse(dat[j,i] == 'Black',1,dat[j,i])
    dat[j,i] = ifelse(dat[j,i] == 'White',0,dat[j,i])
    dat[j,i] = ifelse(dat[j,i] == '',NA,dat[j,i])
  }
  dat[,i] = as.numeric(dat[,i])
}

##summarize
dat$black.perception = rowSums(dat[,race.questions],na.rm=F)/length(race.questions)

print(summary(lm(black.perception~dissimilarity+percent.black,
                 data = dat[dat$Race==1,])))

print(summary(lm(black.perception~dissimilarity,
                 data = dat[dat$Party==3,])))


###fancier
pse.dat = dat[,race.questions]
#test = dat[,!race.questions%in%c(0,1)]
#install.packages('fechner')
#library(fechner)

pses = vector(length = nrow(pse.dat))

##try this by hand replicating code
xs = seq(from = 0, to = 1, by = .1)
xs = rep(xs,each = 10)
for(i in 1:nrow(pse.dat)){
  print(i)
  ys = t(pse.dat[i,])
  is.complete = all(is.na(ys))
  if(is.complete == F){
    glm.out = glm(ys~xs, family=binomial(link='logit'))
    ##create linear vector
    lvector = seq(from = 0, to = 1, length.out = 100)
    ##fit between model and vector
    fits = predict(glm.out,new.data = lvector,
                   type = 'response')
    pses[i] = which.min(abs(fits-.5))
  }
  else{
    pses[i] = NA
  }
}

pses = pses/100
t.test(pses,mu = .5)

dat$pse = pses


##PREDICTION IS A NEGATIVE RELATIONSHIP!!!!!!
print(summary(lm(pse~dissimilarity+attitude_self,
                 data = dat)))


print(summary(lm(pse~dissimilarity+percent.black,
                 data = dat)))

print(summary(lm(pse~percent.black,
                 data = dat)))


t.test(dat$pse[dat$attitude_self %in% c(1,2,3)],
       dat$pse[dat$attitude_self %in% c(5,6,7)])

dat$contact = ((dat$contact_school+dat$contact_neighbors+dat$contact_friends)-21)/3
dat$age = 2016-dat$Birth.Year

print(summary(lm(pse~dissimilarity+percent.black+attitude_self+age,
                 data = dat[dat$Race==1,])))

print(summary(lm(pse~dissimilarity*percent.black+attitude_self+contact+age,
                 data = dat)))


##

##see knowles email on racial essentialism shifting rate black
##looking at acuity

rq1 = race.questions[1:10]
rq2 = race.questions[11:20]
rq3 = race.questions[21:30]
rq4 = race.questions[31:40]
rq5 = race.questions[41:50]
rq6 = race.questions[51:60]
rq7 = race.questions[61:70]
rq8 = race.questions[71:80]
rq9 = race.questions[81:90]
rq10 = race.questions[91:100]
rq11 = race.questions[101:110]

dat$rq1.v = apply(dat[,rq1],1,sd,na.rm=T)
dat$rq2.v = apply(dat[,rq2],1,sd,na.rm=T)
dat$rq3.v = apply(dat[,rq3],1,sd,na.rm=T)
dat$rq4.v = apply(dat[,rq4],1,sd,na.rm=T)
dat$rq5.v = apply(dat[,rq5],1,sd,na.rm=T)
dat$rq6.v = apply(dat[,rq6],1,sd,na.rm=T)
dat$rq7.v = apply(dat[,rq7],1,sd,na.rm=T)
dat$rq8.v = apply(dat[,rq8],1,sd,na.rm=T)
dat$rq9.v = apply(dat[,rq9],1,sd,na.rm=T)
dat$rq10.v = apply(dat[,rq10],1,sd,na.rm=T)
dat$rq11.v = apply(dat[,rq11],1,sd,na.rm=T)

dat$rq.v = apply(dat[,race.questions],1,sd,na.rm=T)
dat$rq.v = apply(dat[,race.questions[50:70]],1,sd,na.rm=T)


###pse
white.dat = dat[dat$Race == 1,]

white.dat = white.dat[complete.cases(white.dat[,c('pse','dissimilarity','percent.black','income.recode','college.educated','contact','south')]),]

white.dat1 = white.dat[white.dat$study==1,]
white.dat2 = white.dat[white.dat$study==2,]

white.dat1.nonsouth = white.dat[white.dat$study==1&white.dat$south==0,]
white.dat2.nonsouth = white.dat[white.dat$study==2&white.dat$south==0,]


dat = dat[dat$Race == 1,]

dat = dat[complete.cases(dat[,c('pse','dissimilarity','percent.black','income.recode','college.educated','contact','south')]),c('pse','dissimilarity','percent.black','income.recode','college.educated','contact','metroid','south','study')]


write.csv(dat,'TablesA17A18A19A20_data.csv',row.names = F)